1  Linear Regression

Author

Lin Yu

Published

January 20, 2025

1.1 Q1

Load the data frame tut1 into R from the file tut1.RData available on the course page. The data frame contains the variables x and y.

Code
# Setup: attach packages, configure rms printing, load the tutorial data,
# and set knitr/ggplot2 defaults for the whole document.
library(here)
library(texreg)
library(rms)
library(kableExtra)
library(dplyr)
library(ggplot2)

# Print rms model output as HTML. (The original set prType to "latex" and
# then immediately overwrote it with "html"; only the final value matters,
# so set the intended value once.)
options(prType = "html")

# Load the data frame `tut1` (variables x and y) from the course data file.
load(here("data", "tut1.rdata"))

# Suppress chunk warnings/messages and apply a uniform ggplot2 theme.
knitr::opts_chunk$set(warning = FALSE,
                      message = FALSE)
theme_set(
  theme_bw(base_size = 18) +
    theme(legend.position = "bottom")
)

Fit a simple linear regression with y as the response and x as the predictor.

fit0 <- ols(y ~ x, data = tut1)
Note

or you can use glm (generalized linear model) or lm (linear model)

# Equivalent fits with base R: glm() (with its default gaussian family)
# and lm() estimate the same simple linear regression as ols() above.
fit_glm <- glm(y ~ x, data = tut1)
fit_lm  <- lm(y ~ x, data = tut1)

The results are identical!

Code
# Side-by-side HTML table of the three fits, forcing confidence
# intervals in place of standard errors.
htmlreg(
  list(fit0, fit_lm, fit_glm),
  ci.force = TRUE,
  custom.model.names = c("ols", "lm", "glm")
)
Statistical models
  ols lm glm
Intercept 1.22*    
  [ 0.80; 1.64]    
x -0.36* -0.36* -0.36*
  [-0.48; -0.24] [-0.48; -0.24] [-0.48; -0.24]
(Intercept)   1.22* 1.22*
    [ 0.80; 1.64] [ 0.80; 1.64]
Num. obs. 100 100 100
R2 0.27 0.27  
Adj. R2 0.27 0.27  
L.R. 32.07    
AIC     303.66
BIC     311.48
Log Likelihood     -148.83
Deviance     114.89
* Null hypothesis value outside the confidence interval.

Diagnostic plots for model checking

Tip

If the model is correctly specified, and all assumptions (known as LINE) are met, the residuals should be randomly scattered around zero

# Collect observed values, fitted values, and residuals from the linear fit.
pred_glm <- tut1 %>%
  mutate(
    yobs  = y,
    yhat  = predict(fit_glm),
    resid = yobs - yhat
  )

# Observed y against x, with a smoother showing the overall trend.
x_y_plot <- ggplot(pred_glm, aes(x, yobs)) +
  geom_point() +
  geom_smooth()

# Residuals against the predictor: should scatter randomly around zero.
x_resid_plot <- ggplot(pred_glm, aes(x, resid)) +
  geom_point() +
  geom_smooth()

# Residuals against fitted values: another check for leftover structure.
yhat_resid_plot <- ggplot(pred_glm, aes(yhat, resid)) +
  geom_point() +
  geom_smooth()

x_y_plot
x_resid_plot
yhat_resid_plot

Predicted values from a restricted cubic spline fit (skyblue) versus the simple linear fit (black):

# Fit a restricted cubic spline with 3 knots and overlay its fitted curve
# (skyblue) on the simple linear fit (black).
fit_rcs <- ols(y ~ rcs(x, 3), data = tut1)

pred_rcs <- tut1 %>%
  mutate(
    # predict() on the fitted model returns fitted values for the training
    # data. The original call passed `data = tut1`, which predict() does not
    # recognise (its argument is `newdata`) and silently ignored via `...`;
    # the result is the same, but the misleading argument is removed.
    pred_rcs = predict(fit_rcs)
  )

pred_rcs %>%
  ggplot(aes(x = x, y = pred_rcs)) +
  geom_line(color = "skyblue") +
  geom_line(data = pred_glm, aes(x = x, y = yhat),
            color = "black", alpha = 0.7)

Check the diagnostic plots again (knots = 3)

# Diagnostics for the 3-knot spline fit: observed values, fitted values,
# and residuals.
pred_rcs <- tut1 %>%
  mutate(
    yobs  = y,
    yhat  = predict(fit_rcs),
    resid = yobs - yhat
  )

# Observed y against x.
x_y_plot <- ggplot(pred_rcs, aes(x, yobs)) +
  geom_point() +
  geom_smooth()

# Residuals against the predictor.
x_resid_plot <- ggplot(pred_rcs, aes(x, resid)) +
  geom_point() +
  geom_smooth()

# Residuals against fitted values.
yhat_resid_plot <- ggplot(pred_rcs, aes(yhat, resid)) +
  geom_point() +
  geom_smooth()

x_y_plot
x_resid_plot
yhat_resid_plot

Experiment with different numbers of knots:

Code
# Refit with 4 knots. The conclusion below compares knots = 4 against
# knots = 5, but the original chunk repeated rcs(x, 5), so the 4-knot fit
# was never actually shown — fixed here.
fit_rcs <- ols(y ~ rcs(x, 4), data = tut1)

pred_rcs <- tut1 %>%
  mutate(
    yobs  = y,
    yhat  = predict(fit_rcs),
    resid = yobs - yhat
  )

# Observed y against x.
x_y_plot <- ggplot(pred_rcs, aes(x, yobs)) +
  geom_point() +
  geom_smooth()

# Residuals against the predictor.
x_resid_plot <- ggplot(pred_rcs, aes(x, resid)) +
  geom_point() +
  geom_smooth()

# Residuals against fitted values.
yhat_resid_plot <- ggplot(pred_rcs, aes(yhat, resid)) +
  geom_point() +
  geom_smooth()

x_y_plot
x_resid_plot
yhat_resid_plot

Code
# Fit with 5 knots and repeat the same diagnostics, to compare against
# smaller knot counts.
fit_rcs <- ols(y ~ rcs(x, 5), data = tut1)

pred_rcs <- tut1 %>%
  mutate(
    yobs  = y,
    yhat  = predict(fit_rcs),
    resid = yobs - yhat
  )

# Observed y against x.
x_y_plot <- ggplot(pred_rcs, aes(x, yobs)) +
  geom_point() +
  geom_smooth()

# Residuals against the predictor.
x_resid_plot <- ggplot(pred_rcs, aes(x, resid)) +
  geom_point() +
  geom_smooth()

# Residuals against fitted values.
yhat_resid_plot <- ggplot(pred_rcs, aes(yhat, resid)) +
  geom_point() +
  geom_smooth()

x_y_plot
x_resid_plot
yhat_resid_plot

Conclusion: knots = 4 is good enough to fit the data! knots = 5 may lead to overfitting.

1.2 Q2

Q2 is a similar question but with more than one predictor!

Load the data frame FEV from the file FEV.RData. For these data, use the variable fev as the response and the rest as the explanatory covariates.

# Load the FEV data frame and display its first rows as a formatted table.
load(here("data", "FEV.rdata"))
FEV %>%
  head() %>%
  kable()
id age fev height sex smoke
301 9 1.708 57.0 female non-current smoker
451 8 1.724 67.5 female non-current smoker
501 7 1.720 54.5 female non-current smoker
642 9 1.558 53.0 male non-current smoker
901 9 1.895 57.0 male non-current smoker
1701 8 2.336 61.0 female non-current smoker

Fit an additive linear model to fev using the other variables as the covariates. Evaluate whether any of the continuous variables should be fit as non-linear terms.

Fit the additive linear model:

# Additive linear model for fev with all covariates entering linearly.
fev_lm <- ols(fev ~ age + height + sex + smoke,
              data = FEV)

# Observed values, fitted values, and residuals for diagnostics.
pred_fev <- FEV %>%
  mutate(
    yobs  = fev,
    yhat  = predict(fev_lm),
    resid = yobs - yhat
  )

# Residuals against each continuous covariate: curvature here suggests a
# non-linear term is needed.
age_resid_plot <- ggplot(pred_fev, aes(age, resid)) +
  geom_point() +
  geom_smooth()

height_resid_plot <- ggplot(pred_fev, aes(height, resid)) +
  geom_point() +
  geom_smooth()

age_resid_plot
height_resid_plot

What do the residual plots indicate?

non-linear relationship!

Fit RCS:

# Model fev with 4-knot restricted cubic splines for the continuous
# covariates (age, height); sex and smoke enter as categorical terms.
fev_rcs <- ols(fev ~ rcs(age, 4) + rcs(height, 4) + sex + smoke,
                data = FEV)

Again, check the diagnostic plots:

# Diagnostics for the spline model: observed, fitted, and residual values.
pred_fev <- FEV %>%
  mutate(
    yobs  = fev,
    yhat  = predict(fev_rcs),
    resid = yobs - yhat
  )

# Observed fev against age.
age_y_plot <- ggplot(pred_fev, aes(age, yobs)) +
  geom_point() +
  geom_smooth()

# Residuals against age.
age_resid_plot <- ggplot(pred_fev, aes(age, resid)) +
  geom_point() +
  geom_smooth()

# Residuals against fitted values.
yhat_resid_plot <- ggplot(pred_fev, aes(yhat, resid)) +
  geom_point() +
  geom_smooth()

age_y_plot
age_resid_plot
yhat_resid_plot

# Same diagnostics for height.
height_y_plot <- ggplot(pred_fev, aes(height, yobs)) +
  geom_point() +
  geom_smooth()

# Residuals against height.
height_resid_plot <- ggplot(pred_fev, aes(height, resid)) +
  geom_point() +
  geom_smooth()

# Residuals against fitted values.
yhat_resid_plot <- ggplot(pred_fev, aes(yhat, resid)) +
  geom_point() +
  geom_smooth()

height_y_plot
height_resid_plot
yhat_resid_plot